package com.metis.document.parse.dialog.pipeline.impl;

import com.metis.document.parse.dialog.pipeline.TextPostProcess;
import java.util.regex.Pattern;
import lombok.extern.slf4j.Slf4j;
import org.jetbrains.annotations.NotNull;
import org.springframework.core.Ordered;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;

/**
 * SectionRegPostProcess
 *
 * <p>Regex-based supplementary clean-up for body text. The replacements run in a fixed
 * order, each one working on the output of the previous one:
 * <ol>
 *   <li>drop whitespace between a digit and a CJK ideograph (either order);</li>
 *   <li>drop whitespace between a digit and a comma, ASCII or full-width (either order);</li>
 *   <li>drop fill-in-the-blank filler: a colon (ASCII or full-width) together with the run
 *       of spaces/underscores that follows it, any run of two or more whitespace
 *       characters (line breaks included), and any run of two or more underscores;</li>
 *   <li>drop leaked page-number field text of the form "PAGE \* MERGEFORMAT n";</li>
 *   <li>drop the table-of-contents headings, with or without whitespace between the two
 *       characters.</li>
 * </ol>
 *
 * <p>2024-03-19: added handling for the "consecutive spaces + underscores" style (strips
 * consecutive whitespace) and removal of text such as "PAGE   \* MERGEFORMAT 3".
 */
@Slf4j
@Component
@Order(Ordered.HIGHEST_PRECEDENCE)
public class SectionRegPostProcess implements TextPostProcess {

    // Patterns are compiled once: String.replaceAll would recompile each regex on every call.

    /** Whitespace located directly between a digit and a CJK ideograph, in either order. */
    private static final Pattern DIGIT_CJK_GAP =
            Pattern.compile("(?<=\\d)\\s*(?=[\\u4e00-\\u9fa5])|(?<=[\\u4e00-\\u9fa5])\\s*(?=\\d)");

    /** Whitespace located directly between a digit and an ASCII/full-width comma, in either order. */
    private static final Pattern DIGIT_COMMA_GAP =
            Pattern.compile("(?<=\\d)\\s*(?=[,，])|(?<=[,，])\\s*(?=\\d)");

    /** Colon plus trailing spaces/underscores, or a run of 2+ whitespace chars, or a run of 2+ underscores. */
    private static final Pattern BLANK_FILLER =
            Pattern.compile("(:|：)( |_)+|\\s{2,}|_{2,}");

    /**
     * Leaked page-number field text: the literal {@code PAGE}, a backslash-asterisk switch,
     * {@code MERGEFORMAT} and the page number, with optional whitespace around each part.
     * Any page number is accepted; the previous pattern only matched the digit 3.
     */
    private static final Pattern PAGE_FIELD =
            Pattern.compile("\\s*PAGE\\s*\\\\\\*\\s*MERGEFORMAT\\s*\\d+\\s*");

    /** The two table-of-contents heading spellings, optionally with whitespace between the characters. */
    private static final Pattern TOC_HEADING =
            Pattern.compile("[目]\\s*[录]|[目]\\s*[次]");

    /**
     * Applies the clean-up replacements to {@code input} in the order listed on the class.
     *
     * @param input text to clean; must not be {@code null}
     * @return the text with every match of the patterns above removed
     */
    @Override
    public String process(@NotNull String input) {
        String text = DIGIT_CJK_GAP.matcher(input).replaceAll("");
        text = DIGIT_COMMA_GAP.matcher(text).replaceAll("");
        // Must run before PAGE_FIELD only in the sense that order is preserved from the
        // original chain; PAGE_FIELD tolerates the whitespace being present or absent.
        text = BLANK_FILLER.matcher(text).replaceAll("");
        text = PAGE_FIELD.matcher(text).replaceAll("");
        return TOC_HEADING.matcher(text).replaceAll("");
    }
}


